#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <string.h>

#include "cv.h"
#include "highgui.h"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/core/core.hpp"
#include "opencv/cv.hpp"

using namespace cv;
using namespace std;

#include "calc_sobel.hpp"
#include <stdlib.h>
#include "common.h"

#define TEST_TIMES 20

/**
 * Dump a row-major float image to a text file.
 *
 * Output format: every value is written as " %f"; a newline follows every
 * 20th value of a row, and each row is terminated by "\n\n".
 *
 * The values are streamed straight to the FILE (stdio already buffers), which
 * avoids formatting a buffer into itself with sprintf (undefined behaviour for
 * overlapping source/destination) and any risk of overrunning a fixed buffer.
 *
 * @param dataIn       host pointer to width*height floats, row-major
 * @param strFileName  path of the file to create/truncate
 * @param width        number of values per row
 * @param height       number of rows
 *
 * On a NULL argument or if the file cannot be opened, a message is printed to
 * stderr and nothing is written.
 */
void dumpImageDataToFile(float * dataIn, char * strFileName, int width, int height)
{
	if (dataIn == NULL || strFileName == NULL)
	{
		fprintf(stderr, "dumpImageDataToFile: NULL argument\n");
		return;
	}

	FILE * fp = fopen(strFileName, "w");
	if (fp == NULL)
	{
		fprintf(stderr, "dumpImageDataToFile: cannot open '%s' for writing\n", strFileName);
		return;
	}

	for (int row = 0; row < height; row++)
	{
		for (int col = 0; col < width; col++)
		{
			// size_t index so large images cannot overflow int arithmetic.
			fprintf(fp, " %f", dataIn[(size_t)row * width + col]);
			if (col % 20 == 19)
			{
				// Wrap long rows every 20 values to keep lines readable.
				fputc('\n', fp);
			}
		}
		// Blank line separates consecutive image rows.
		fputs("\n\n", fp);
	}

	// Closing flushes the stdio buffer and releases the handle.
	fclose(fp);
}

/**
 * Dump a row-major image of integral samples to a text file.
 *
 * Output format: every value is written as " %d"; a newline follows every
 * 20th value of a row, and each row is terminated by "\n\n".
 *
 * Each sample is converted to int before printing, so T must be a type whose
 * values fit in int (e.g. unsigned char, short, int).
 *
 * The values are streamed straight to the FILE (stdio already buffers), which
 * avoids formatting a buffer into itself with sprintf (undefined behaviour for
 * overlapping source/destination) and any risk of overrunning a fixed buffer.
 *
 * @param dataIn       host pointer to width*height samples, row-major
 * @param strFileName  path of the file to create/truncate
 * @param width        number of values per row
 * @param height       number of rows
 *
 * On a NULL argument or if the file cannot be opened, a message is printed to
 * stderr and nothing is written.
 */
template <typename T>
void dumpImageDataToFile(T * dataIn, char * strFileName, int width, int height)
{
	if (dataIn == NULL || strFileName == NULL)
	{
		fprintf(stderr, "dumpImageDataToFile: NULL argument\n");
		return;
	}

	FILE * fp = fopen(strFileName, "w");
	if (fp == NULL)
	{
		fprintf(stderr, "dumpImageDataToFile: cannot open '%s' for writing\n", strFileName);
		return;
	}

	for (int row = 0; row < height; row++)
	{
		for (int col = 0; col < width; col++)
		{
			// Explicit cast keeps the argument type in step with "%d" for any T;
			// size_t index so large images cannot overflow int arithmetic.
			fprintf(fp, " %d", static_cast<int>(dataIn[(size_t)row * width + col]));
			if (col % 20 == 19)
			{
				// Wrap long rows every 20 values to keep lines readable.
				fputc('\n', fp);
			}
		}
		// Blank line separates consecutive image rows.
		fputs("\n\n", fp);
	}

	// Closing flushes the stdio buffer and releases the handle.
	fclose(fp);
}

int main(int argc, char *argv[])
{
	if (argc < 2)
	{
		printf("No parameter for image file name!\n");
		return -1;
	}
	Mat image;
	image = imread(argv[1], IMREAD_GRAYSCALE);

	if (!image.data)
	{
		printf("No image data \n");
		return -1;
	}

	uchar * devBuff;
	int devBuffBlockNum;
	int pitch;
	int ret = init_cuda(image.cols, image.rows, image.data, &devBuff, &devBuffBlockNum, &pitch);
	printf("sizeof int[%d],ret is %d,pitch is %d, width is %d, height is %d\n",sizeof(int), ret, pitch, image.cols,image.rows);

	//cudaMemcpy2D(image.data, image.cols, devBuff, pitch, image.cols, image.rows,cudaMemcpyDeviceToHost);
	cudaMemcpy(devBuff + 1024*1024, image.data, image.rows*image.cols, cudaMemcpyHostToDevice);

	double dStart = 0;
	double dEnd = 0;
	double sumTime = 0;
	double avgTime = 0;
	
	/***********test for sobel************************/
	short * dx_result = (short*)malloc(sizeof(short)*image.cols*image.rows);
	short * dy_result = (short*)malloc(sizeof(short)*image.cols*image.rows);
	int * dnorm_result = (int*)malloc(sizeof(int)*image.cols*image.rows);

	sumTime = 0;
	for (int i = 0; i < 20;i++)
	{
		memset(dx_result, 0, sizeof(short)*image.cols*image.rows);
		memset(dy_result, 0, sizeof(short)*image.cols*image.rows);
		memset(dnorm_result, 0, sizeof(int)*image.cols*image.rows);

		dStart = getCpuClock();
		calc_sobel_cuda_v1(devBuff + 1024 * 1024, image.cols, image.rows, devBuff + (1024 * 1024<<1), dx_result, dy_result, dnorm_result);

		//calc_sobel_cuda(devBuff, image.cols, image.rows, devBuff + 1024 * 1024, pitch, dx_result, dy_result, dnorm_result);

		//calc_sobel_cuda(devBuff, image.cols, image.rows, devBuff + pitch*image.rows, pitch, dx_result, dy_result, dnorm_result);
		dEnd = getCpuClock();

		sumTime = sumTime + dEnd - dStart;
	}	

	printf("calc_sobel_cuda cost %fs\n", sumTime/20);
	
	dumpImageDataToFile(dx_result, "cudaDx_short.txt", image.cols, image.rows);
	//int pitch1 = (((image.cols + 2) + 7) >> 3) << 3;
	//dumpImageDataToFile(dx_result, "cudaDx_short_t.txt", pitch1, 4);
	//dumpImageDataToFile((uchar*)dy_result, "expanded_src.txt", image.cols + 2, 4);
	//dumpImageDataToFile(image.data, "img_data_src.txt", image.cols +2, 4);

	free(dx_result);
	free(dy_result);
	free(dnorm_result);

	/*float * dx_result = (float*)malloc(sizeof(float)*image.cols*image.rows);
	float * dy_result = (float*)malloc(sizeof(float)*image.cols*image.rows);
	float * dnorm_result = (float*)malloc(sizeof(float)*image.cols*image.rows);

	dStart = getCpuClock();
	calc_sobel_cuda(devBuff, image.cols, image.rows, devBuff + pitch*image.rows, pitch, dx_result, dy_result, dnorm_result);
	dEnd = getCpuClock();

	printf("calc_sobel_cuda cost %fs\n", dEnd - dStart);
	//printFloatArray(dx_result, image.cols);
	dumpImageDataToFile(dx_result, "cudaDx.txt", image.cols, image.rows);

	free(dx_result);
	free(dy_result);
	free(dnorm_result);
	*/

	Mat Dx,Dy;
	sumTime = 0;
	for (int i = 0; i < 1; i++)
	{
		Dx.release();
		Dy.release();
		dStart = getCpuClock();
		Sobel(image, Dx, CV_16S, 1, 0, 3, 1, 0, BORDER_DEFAULT);
		Sobel(image, Dy, CV_16S, 0, 1, 3, 1, 0, BORDER_DEFAULT);
		dEnd = getCpuClock();
		sumTime = sumTime + dEnd - dStart;
		
	}
	printf("opencv sobel cost %fs\n", sumTime/1);
	dumpImageDataToFile((short*)Dx.data, "opencvDx_short.txt", image.cols, image.rows);
	Dx.release();
	Dy.release();

	deinit_cuda(devBuff, &devBuffBlockNum, &pitch);
	return 0;
}

